# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
from scr_redis.items import LaGouItem
import re
import time
from datetime import datetime
from datetime import timedelta

class BaiduSpider(RedisCrawlSpider):
    """Crawl job postings from lagou.com and emit one ``LaGouItem`` per job.

    The class derives from ``RedisCrawlSpider``; the crawl is started through
    the Redis key named by ``redis_key``. Listing, company and campus pages
    are only followed, while job detail pages are handed to ``parse_item``.

    NOTE: despite the class name, the spider targets lagou.com (see ``name``
    and ``allowed_domains``).
    """

    name = 'lagou'
    allowed_domains = ['lagou.com']
    # Redis key name used to start the crawl.
    redis_key = 'start_url'

    rules = (
        # Search result listings (taken from <form action="...">).
        Rule(LinkExtractor(allow=(r'lagou\.com/jobs/list_',), tags=('form',), attrs=('action',)), follow=True),
        # Company recruitment pages.
        Rule(LinkExtractor(allow=(r'lagou\.com/gongsi/',), tags=('a',), attrs=('href',)), follow=True),
        # Company list pages.
        Rule(LinkExtractor(allow=(r'/gongsi/j\d+\.html',), tags=('a',), attrs=('href',)), follow=True),
        # Campus recruitment.
        Rule(LinkExtractor(allow=(r'xiaoyuan\.lagou\.com',), tags=('a',), attrs=('href',)), follow=True),
        # Campus job categories.
        Rule(LinkExtractor(allow=(r'isSchoolJob',), tags=('a',), attrs=('href',)), follow=True),
        # Job detail pages: the only rule with a callback; links on them are not followed.
        Rule(LinkExtractor(allow=(r'jobs/\d+\.html',), tags=('a',), attrs=('href',)), callback='parse_item',
             follow=False),
    )

    # Regex used by asks() and times() to pull digit runs out of text.
    num_pattern = re.compile(r'\d+')

    # Per-spider settings.
    # NOTE(review): "Host" is fixed to www.lagou.com although the rules also
    # follow xiaoyuan.lagou.com links — confirm this header is intended for
    # every request.
    # NOTE(review): the Cookie value is a hard-coded session captured in
    # November 2017 (see the timestamps inside it); it has presumably expired
    # and should come from configuration rather than source code.
    custom_settings = {
         'DEFAULT_REQUEST_HEADERS' : {
            "Host": "www.lagou.com",
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
            "Content-type": "application/json;charset=utf-8",
            "Accept": "*/*",
            "Referer": "https://www.lagou.com",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cookie": "user_trace_token=20171116192426-b45997e2-cac0-11e7-98fd-5254005c3644; LGUID=20171116192426-b4599a6d-cac0-11e7-98fd-5254005c3644; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAAAGFABEFC0E3267F681504E5726030548F107348; _gat=1; X_HTTP_TOKEN=d8b7e352a862bb108b4fd1b63f7d11a7; _gid=GA1.2.1718159851.1510831466; _ga=GA1.2.106845767.1510831466; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1510836765,1510836769,1510837049,1510838482; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1510839167; LGSID=20171116204415-da8c7971-cacb-11e7-930c-525400f775ce; LGRID=20171116213247-a2658795-cad2-11e7-9360-525400f775ce",
        },
        # Presumably keeps Scrapy's cookie handling from interfering with the
        # static Cookie header above — confirm against the Scrapy docs.
        'COOKIES_ENABLED' : False,
        'CONCURRENT_REQUESTS' : 5,
    }

    def parse_item(self, response):
        """Build a ``LaGouItem`` from a job detail page.

        :param response: the downloaded detail page.
        :returns: the populated item, or ``None`` (after logging a warning)
            when the page has no job title or fewer than five
            ``job_request`` spans, i.e. it is not laid out like a job page.

        Optional fields that are missing on the page are stored as ``''``
        instead of raising ``IndexError``.
        """
        spans = response.xpath('//dd[@class="job_request"]//span')
        title = response.css('span.name::text').extract_first()
        if title is None or len(spans) < 5:
            # Not a regular job page; skip it rather than crash on indexing.
            self.logger.warning('Unexpected job page layout, skipped: %s', response.url)
            return None

        def span_text(index):
            # First text node of the index-th job_request span, '' if empty.
            return spans[index].css('span::text').extract_first(default='')

        salary = span_text(0)                                # salary
        city = self.splits(span_text(1))                     # work city
        start, end = self.asks(self.splits(span_text(2)))    # experience range
        edu = self.splits(span_text(3))                      # education
        job_type = span_text(4)                              # job type

        # Position labels joined with '-'.
        label = "-".join(response.xpath('//ul[@class="position-label clearfix"]//li/text()').extract())

        # Publish time. str.strip() treats its argument as a SET of characters
        # removed from both ends, not as a literal suffix.
        raw_time = response.xpath('//p[@class="publish_time"]//text()').extract_first(default='')
        publish_time = self.times(raw_time.strip('\xa0 发布于拉勾网'))

        # Job advantage / perks.
        tempy = response.xpath('//dd[@class="job-advantage"]//p/text()').extract_first(default='')

        # Job description: every text node with all whitespace removed.
        discription = ''.join(
            ''.join(piece.split())
            for piece in response.xpath('//dd[@class="job_bt"]//div//text()').extract()
        )

        # Detailed work address: link texts (the last link is dropped) joined
        # with '-', followed by the loose text nodes with whitespace removed.
        addr = '-'.join(response.xpath('//div[@class="work_addr"]//a/text()').extract()[:-1])
        address = ''.join(
            ''.join(piece.split())
            for piece in response.xpath('//div[@class="work_addr"]/text()').extract()
        )
        loction = addr + address

        # Load the data into the item (keys must match the LaGouItem fields).
        item = LaGouItem()
        item["title"] = title
        item["url"] = response.url
        item["salary"] = salary
        item["city"] = city
        item["start"] = start
        item["end"] = end
        item["edu"] = edu
        item["job_type"] = job_type
        item["label"] = label
        item["publish_time"] = publish_time
        item["tempy"] = tempy
        item["discription"] = discription
        item["loction"] = loction
        return item

    def splits(self, value):
        """Return ``value`` without its surrounding slashes and whitespace.

        Whitespace is stripped both outside and inside the slashes, so a
        slash is still removed when it is preceded or followed by spaces and
        no stray blanks are left in the result.
        """
        return value.strip().strip('/').strip()

    def asks(self, value):
        """Parse an experience requirement into a ``(start, end)`` pair.

        :param value: experience text from the detail page.
        :returns: ``(0, 0)`` (ints) when the text contains '不限' or no
            digits at all; otherwise digit strings: the first number twice
            when the text contains '以下' or only one number, else the first
            two numbers.
        """
        if '不限' in value:
            return 0, 0

        numbers = self.num_pattern.findall(value)
        if not numbers:
            # No numeric bound in the text: treat it like "no requirement"
            # instead of raising IndexError / AttributeError.
            return 0, 0

        if '以下' in value or len(numbers) == 1:
            # Single bound: use it for both ends.
            return numbers[0], numbers[0]

        return numbers[0], numbers[1]

    def times(self, value):
        """Normalise a publish-time text to a date string.

        :param value: publish-time text with the site suffix already removed.
        :returns: today's date as ``YYYY-MM-DD`` when the text contains ':'
            (a clock time); today minus N days when it contains '天前' with a
            number N; otherwise ``value`` unchanged.
        """
        if ':' in value:
            return datetime.now().strftime('%Y-%m-%d')

        if '天前' in value:
            match = self.num_pattern.search(value)
            if match is None:
                # "days ago" without a number: nothing to compute.
                return value
            published = datetime.now() - timedelta(days=int(match.group()))
            return published.strftime('%Y-%m-%d')

        return value
